{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## AdaBoost\n",
    "\n",
    "原理参考\n",
    "\n",
    "[知乎：手把手教你AdaBoost](https://zhuanlan.zhihu.com/p/27126737?utm_source=wechat_session&utm_medium=social&utm_oi=1044170377731248128)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:21:43.288212Z",
     "start_time": "2020-01-13T08:21:43.283227Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n",
    "len(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:21:43.714331Z",
     "start_time": "2020-01-13T08:21:43.709381Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "y = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]\n",
    "y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "基学习算法：二分类器，就给个阈值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 第一个个体学习器\n",
    "\n",
    "我们首先认为$x_i$(i=1,2,…,10)的权重是一样的，即每一个数据同等重要。（权重是用来计算误差的）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:21:51.003882Z",
     "start_time": "2020-01-13T08:21:50.997897Z"
    }
   },
   "outputs": [],
   "source": [
    "w_1 = []\n",
    "for i in range(len(x)):\n",
    "    w_1.append(0.1)\n",
    "w_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:22:04.065962Z",
     "start_time": "2020-01-13T08:22:04.052997Z"
    }
   },
   "outputs": [],
   "source": [
    "data = {'x':x, 'y':y,'w_1':w_1}\n",
    "pd.DataFrame(data).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T07:48:19.491217Z",
     "start_time": "2020-01-13T07:48:19.486195Z"
    }
   },
   "outputs": [],
   "source": [
    "# 设置阈值\n",
    "threshs = [i - 0.5 for i in x]\n",
    "threshs.append(9.5)\n",
    "threshs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T07:48:20.165843Z",
     "start_time": "2020-01-13T07:48:20.159861Z"
    }
   },
   "outputs": [],
   "source": [
    "# 小于阈值的为1，大于阈值的为-1\n",
    "# 根据每个阈值进行预测\n",
    "# 以thresh = 2.5 为例\n",
    "y_pres = []\n",
    "thresh = 2.5\n",
    "for i in range(len(x)):\n",
    "    if x[i] < thresh:\n",
    "        y_pres.append(1)\n",
    "    else:\n",
    "        y_pres.append(-1)\n",
    "y_pres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T07:48:21.074298Z",
     "start_time": "2020-01-13T07:48:21.060297Z"
    }
   },
   "outputs": [],
   "source": [
    "y_pres_all_thresh = {}\n",
    "for thresh in threshs:\n",
    "    y_pres = []\n",
    "    for i in range(len(x)):\n",
    "        if x[i] < thresh:\n",
    "            y_pres.append(1)\n",
    "        else:\n",
    "            y_pres.append(-1)\n",
    "    y_pres_all_thresh[thresh] = y_pres\n",
    "pd.DataFrame(y_pres_all_thresh).T\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T07:48:21.552987Z",
     "start_time": "2020-01-13T07:48:21.547005Z"
    }
   },
   "outputs": [],
   "source": [
    "# 计算每个阈值的错误率\n",
    "e = 0\n",
    "thresh = 1.5\n",
    "for i in range(len(y)):\n",
    "    if y_pres_all_thresh[thresh][i] != y[i]:\n",
    "        e+= w[i]\n",
    "e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T07:48:21.921555Z",
     "start_time": "2020-01-13T07:48:21.909587Z"
    }
   },
   "outputs": [],
   "source": [
    "# 计算所有阈值的错误率\n",
    "e_s = {}\n",
    "e_min = len(x)\n",
    "n = 0\n",
    "for thresh in threshs:\n",
    "    e = 0\n",
    "    for i in range(len(y)):\n",
    "        if y_pres_all_thresh[thresh][i] != y[i]:\n",
    "            e+= w [i]\n",
    "    e_s[thresh] = round(e,6)\n",
    "    if e < e_min:\n",
    "        e_min = round(e, 6)\n",
    "        n = thresh\n",
    "e_s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T07:48:22.509670Z",
     "start_time": "2020-01-13T07:48:22.505682Z"
    }
   },
   "outputs": [],
   "source": [
    "# 最小错误率\n",
    "e_min"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T07:48:22.789921Z",
     "start_time": "2020-01-13T07:48:22.785933Z"
    }
   },
   "outputs": [],
   "source": [
    "# 最小错误率对应阈值（同样错误率选小的阈值）\n",
    "n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:10:16.392680Z",
     "start_time": "2020-01-13T08:10:16.379678Z"
    }
   },
   "outputs": [],
   "source": [
    "data['pre_1'] = y_pres_all_thresh[n]\n",
    "pd.DataFrame(data).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:08:39.222280Z",
     "start_time": "2020-01-13T08:08:39.217297Z"
    }
   },
   "outputs": [],
   "source": [
    "# 计算模型权重a\n",
    "import math\n",
    "\n",
    "a_1 = 0.5 * math.log((1 - e_min)/e_min)\n",
    "a_1 = round(a, 6)\n",
    "a_1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这里可以看出，错误率为什么必须小于0.5，否则a<0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:08:40.020166Z",
     "start_time": "2020-01-13T08:08:40.012188Z"
    }
   },
   "outputs": [],
   "source": [
    "# 更新权重\n",
    "\n",
    "w_2_tmp = []\n",
    "\n",
    "for i in range(len(x)):\n",
    "    w_new = w[i]*math.exp(-1*a*y[i]*y_pres_all_thresh[n][i])\n",
    "    w_new = round(w_new, 6)\n",
    "    w_2_tmp.append(w_new)\n",
    "\n",
    "w_2_tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:08:40.199789Z",
     "start_time": "2020-01-13T08:08:40.193806Z"
    }
   },
   "outputs": [],
   "source": [
    "z = 0\n",
    "for i in range(len(w_2_tmp)):\n",
    "    z += w_2_tmp[i]\n",
    "z"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:08:40.398266Z",
     "start_time": "2020-01-13T08:08:40.392275Z"
    }
   },
   "outputs": [],
   "source": [
    "z = sum(w_2_tmp)\n",
    "z"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:08:40.591741Z",
     "start_time": "2020-01-13T08:08:40.586756Z"
    }
   },
   "outputs": [],
   "source": [
    "w_2 = [round(i/z, 5) for i in w_2_tmp]\n",
    "w_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可以看到x=6,7,8的数据的权重变大了，而其他数据的权重降低了，这是希望能把之前经常分类错误（经常分类错误会出现权重不断变大）的数据能在下一个个体学习器分类正确（记住：权重是用来计算误差的，为了降低误差，选择阈值时会倾向把权重大的分类正确）\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:11:01.418553Z",
     "start_time": "2020-01-13T08:11:01.413567Z"
    }
   },
   "outputs": [],
   "source": [
    "data['pre_1']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-13T08:14:15.161661Z",
     "start_time": "2020-01-13T08:14:15.145983Z"
    }
   },
   "outputs": [],
   "source": [
    "g_1_s = []\n",
    "for i in range(len(x)):\n",
    "    g_1 = a * data['pre_1'][i]\n",
    "    g_1_s.append(g_1)\n",
    "data['g_1'] =  g_1_s\n",
    "pd.DataFrame(data).T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def func_AdaBoost(x, y, T):\n",
    "    \n",
    "    # 储存变量的列表\n",
    "    a_n_s = [] # 存放个体学习器权重\n",
    "    w_n_s = [] # 存放样本权重\n",
    "    pre_n_s = [] # 存放预测值\n",
    "    g_n_s = []  # 存放加权后预测值\n",
    "    n_s = []\n",
    "\n",
    "    \n",
    "    # 权重初始化\n",
    "    w_1 = func_w_1(x)    \n",
    "    w_n_s.append(w_1)\n",
    "    \n",
    "    # 生成阈值\n",
    "    threshs = func_threshs(x)\n",
    "    \n",
    "    for i in range(3):  \n",
    "    \n",
    "        # 变量权重\n",
    "        w_n = w_n_s[i]\n",
    "        # 所有阈值二分法结果 positive 小于阈值为1，大于阈值为-1，negative相反\n",
    "        y_pres_all_thresh_tmp_ps, y_pres_all_thresh_tmp_ng = func_y_pres_all_thresh_tmp(threshs)\n",
    "        # 计算一个阈值尝试两种判定，计算所有情况加权后错误率，最小错误率，及对应的阈值\n",
    "        e_s, e_min, n, y_pres_all_thresh = func_e_s(x, y, threshs, w_n, y_pres_all_thresh_tmp_ps, y_pres_all_thresh_tmp_ng)\n",
    "        # 获得最佳阈值对应预测\n",
    "        pre_n = func_pre_n(y_pres_all_thresh, n)\n",
    "        # 计算模型权重a\n",
    "        a_n = func_a_n(e_min)\n",
    "        # 计算权重暂时值\n",
    "        w_n_tmp = func_w_n_tmp(x, pre_n,w_n,a_n)\n",
    "        # 计算z\n",
    "        z_n = func_z_n(w_n_tmp)\n",
    "        # 计算调整后权重\n",
    "        w_n = func_w_n(w_n_tmp, z_n)\n",
    "        # 计算第一个模型加权后的预测\n",
    "        g_n = func_g_n(x, pre_n, a_n)\n",
    "        \n",
    "        # 数据存储\n",
    "        a_n_s.append(a_n) # 存放个体学习器权重\n",
    "        w_n_s.append(w_n)# 存放样本权重\n",
    "        pre_n_s.append(pre_n)# 存放预测值\n",
    "        g_n_s.append(g_n)\n",
    "        n_s.append(n)\n",
    "\n",
    "        \n",
    "    return a_n_s, w_n_s, pre_n_s, g_n_s, n_s"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "nbTranslate": {
   "displayLangs": [
    "*"
   ],
   "hotkey": "alt-t",
   "langInMainMenu": true,
   "sourceLang": "en",
   "targetLang": "fr",
   "useGoogleTranslate": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
